/*
 * Copyright 2010 Tom Gibara
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0

 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 */
package com.tomgibara.crinch.hashing;

import java.math.BigInteger;
import java.util.Arrays;
import java.util.Comparator;

/**
 * <p>
 * A "minimal perfect hash" for Strings. After construction with an array of
 * <em>n</em> unique non-null strings, an instance of this class will return a
 * unique hash value <em>h</em> (0 <= h < n) for any string <em>s</em> in the
 * array. A negative has value will typically be returned for a string that is
 * not in the array.
 * </p>
 * 
 * <p>
 * However, the supplied array is <em>not</em> retained. This means that the
 * implementation cannot necessarily confirm that a string is not in the
 * supplied array. Where this implementation cannot distinguish that a string is
 * not in the array, a 'valid' hash value may be returned. Under no
 * circumstances will a hash value be returned that is greater than or equal to
 * <em>n</em>.
 * </p>
 * 
 * <p>
 * <strong>IMPORTANT NOTE:<strong> The array of strings supplied to the
 * constructor will be mutated: it is re-ordered so that
 * <code>hash(a[i]) == i</code>. Application code must generally use this
 * information to map hash values back onto the appropriate string value.
 * </p>
 * 
 * <p>
 * <strong>NOTE:<strong> Good performance of this algorithm is predicated on
 * string hash values being cached by the <code>String</code> class. Experience
 * indicates that is is a good assumption.
 * </p>
 * 
 * 
 * @author Tom Gibara
 */

public class PerfectStringHash implements Hash<String> {

	// statics
	
	/**
	 * Comparator used to order the supplied string array. Hashcodes take
	 * priority, we will do a binary search on those. Otherwise, lengths take
	 * priority over character ordering because the hash algorithm prefers to
	 * compare lengths, it's faster.
	 */
	
	private static final Comparator<String> comparator = new Comparator<String>() {
		@Override
		public int compare(String s1, String s2) {
			final int h1 = s1.hashCode();
			final int h2 = s2.hashCode();
			if (h1 == h2) {
				final int d = s1.length() - s2.length();
				return d == 0 ? s1.compareTo(s2) : d;
			}
			return h1 < h2 ? -1 : 1;
		}
	};

	/**
	 * Builds a (typically v. small) decision tree for distinguishing strings
	 * that share the same hash value.
	 * 
	 * @param values
	 *            the string values to distinguish
	 * @param start
	 *            the index from which the values should be read
	 * @param length
	 *            the number of string values that need to be distinguished
	 * @param pivots
	 *            the array that will hold our decision nodes
	 * @param pivotIndex
	 *            the index at which the tree should be written
	 */
	private static void generatePivots(String[] values, int start, int length, int[] pivots, int pivotIndex) {
		final int capacity = Integer.highestOneBit(length - 1) << 1;
		final int depth = Integer.numberOfTrailingZeros(capacity);
		pivots[ pivotIndex << 1     ] = depth;
		pivots[(pivotIndex << 1) + 1] = length;
		pivotIndex++;
		//build the array
		for (int i = 0; i < depth; i++) {
			int step = capacity >> i;
			for (int j = (1 << (depth-i-1)) - 1; j < capacity; j += step) {
				final int part;
				final int comp;
				if (j >= length - 1) {
					part = Integer.MIN_VALUE;
					comp = 0;
				} else {
					final String v1 = values[start + j];
					final String v2 = values[start + j + 1];
					final int l1 = v1.length();
					final int l2 = v2.length();
					if (l1 == l2) {
						int tPart = -1;
						int tComp = -1;
						for (int k = 0; k < l1; k++) {
							final char c1 = v1.charAt(k);
							final char c2 = v2.charAt(k);
							if (c1 == c2) continue;
							if (c1 < c2) { //must occur at some point because we have already checked that the two strings are unequal
								tPart = k;
								tComp = c1;
							} else {
								//shouldn't be possible - we've sorted the strings to avoid this case
								throw new IllegalStateException();
							}
							break;
						}
						//check if we've been passed a duplicated value
						if (tPart == -1) throw new IllegalArgumentException("duplicate value: " + v1);
						part = tPart;
						comp = tComp;
					} else {
						part = -1;
						comp = l1;
					}
				}
				pivots[ pivotIndex<<1     ] = part;
				pivots[(pivotIndex<<1) + 1] = comp;
				pivotIndex++;
			}
		}
	}
	
	// fields
	
	/**
	 * The hashcodes of the supplied strings.
	 */
	
	private final int[] hashes;
	
	/**
	 * Stores two ints for every string, an offset into the pivot array (-1 if
	 * not necessary) and the depth of the decision tree that is rooted there.
	 */
	
	private final int[] offsets;
	
	/**
	 * Stores two ints for every decision, the index at which a character
	 * comparison needs to be made, followed by the character value to be
	 * compared against; or -1 to indicate a length comparison, followed by the
	 * length to be compared against.
	 */
	
	private final int[] pivots;

	/**
	 * Cache a range object which indicates the range of hash values generated.
	 */
	
	private final HashRange range;
	
	/**
	 * Constructs a minimal perfect string hashing over the supplied strings.
	 * 
	 * @param values
	 *            an array of unique non-null strings that will be reordered
	 *            such that <code>hash(values[i]) == i</code>.
	 */
	
	public PerfectStringHash(final String... values) {
		final int length = values.length;
		if (length == 0) throw new IllegalArgumentException("No values supplied");
		
		final int[] hashes = new int[length];
		final int[] offsets = new int[2 * length];
		final int[] runLengths = new int[length];
		
		//sort values so that we can assume ordering by hashcode, length and char[]
		Arrays.sort(values, comparator);
		
		//pull the hashcodes into an array for analysis
		for (int i = 0; i < length; i++) hashes[i] = values[i].hashCode();
		
		//test for unique hashes
		int offset = 0;
		if (length > 1) {
			int previousHash = hashes[0];
			int runLength = 1;
			for (int i = 1; i <= length; i++) {
				int currentHash = i == length ? ~previousHash : hashes[i];
				if (currentHash == previousHash) {
					runLength++;
				} else {
					if (runLength > 1) {
						final int firstIndex = i - runLength;
						for (int j = i - 1; j >= firstIndex; j--) {
							runLengths[j] = runLength;
							//offset points to the first node in decision tree
							offsets[ j<<1     ] = offset;
							//adjustment is number of indices to first duplicate
							offsets[(j<<1) + 1] = j - firstIndex;
						}
						//extra one for recording depth
						offset += (Integer.highestOneBit(runLength - 1) << 1);
						runLength = 1;
					} else {
						runLengths[i-1] = 1;
						offsets[(i-1)<<1] = -1;
					}
				}
				previousHash = currentHash;
			}
		}

		//shortcut for when all hashes are unique
		if (offset == 0) {
			this.hashes = hashes;
			this.offsets = null;
			this.pivots = null;
			this.range = new HashRange(0, length - 1);
			return;
		}

		//build the decision trees
		final int[] pivots = new int[offset * 2];
		for (int i = 0; i < length;) {
			final int runLength = runLengths[i];
			if (runLength > 1)  generatePivots(values, i, runLength, pivots, (int) offsets[i << 1]);
			i += runLength;
		}
		
		//setup our state
		this.pivots = pivots;
		this.offsets = offsets;
		this.hashes = hashes;
		this.range = new HashRange(0, length - 1);
	}
	
	// hash generator methods
	
	@Override
	public HashRange getRange() {
		return range;
	}
	
	@Override
	public BigInteger hashAsBigInt(String value) {
		return BigInteger.valueOf(hash(value));
	}
	
	//TODO decide whether to throw an IAE if -1 is returned from hash
	@Override
	public int hashAsInt(String value) {
		return hash(value);
	}
	
	@Override
	public long hashAsLong(String value) {
		return hash(value);
	}
	
	/**
	 * Generates a hashcode for the supplied string.
	 * 
	 * @param value
	 *            any string, not null
	 * @return a minimal hashcode for the supplied string, or -1
	 */

	private int hash(String value) {
		final int h = value.hashCode();
		final int index = Arrays.binarySearch(hashes, h);
		final int[] pivots = this.pivots;
		if (pivots == null || index < 0) return index;

		final int offset = offsets[index << 1];
		if (offset == -1) return index;
		
		final int depth = pivots[(offset << 1)    ];
		final int count = pivots[(offset << 1) + 1];
		int i = 0;
		for (int d = 0; d < depth; d++) {
			final int t = (offset + (1 << d) + i) << 1;
			final int part = pivots[t    ];
			final int comp = pivots[t + 1];
			final boolean right;
			if (part == Integer.MIN_VALUE) { //easy case - no right value
				right = false;
			} else if (part == -1) { //compare length
				right = value.length() > comp;
			} else { //lengths are equal, compare character
				right = value.charAt(part) > (char) comp;
			}
			i <<= 1;
			if (right) i++;
		}
		return i >= count ? -1 : index + i - offsets[(index << 1) + 1];
	}
	
}
